{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "58033684",
   "metadata": {
    "papermill": {
     "duration": 0.02106,
     "end_time": "2022-01-10T17:24:00.457515",
     "exception": false,
     "start_time": "2022-01-10T17:24:00.436455",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Output RDMBS Alchemy"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54c523c6",
   "metadata": {
    "papermill": {
     "duration": 0.022015,
     "end_time": "2022-01-10T17:24:00.502963",
     "exception": false,
     "start_time": "2022-01-10T17:24:00.480948",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "This component pushes data to any RDMBS supported by SQLAlchemy as CSV on a given table. Parameters like host, database, user, password and table name need to be set.\n",
    "\n",
    "Currently only append mode is supported via the execute_batch helper. Optionally, you can flush (truncate) the table before.\n",
    "\n",
    "The current implementation loads all data into main memory (via pandas) first. Better implementations can be found below, PRs welcome!\n",
    "\n",
    "https://hakibenita.com/fast-load-data-python-postgresql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3cb2d04",
   "metadata": {
    "papermill": {
     "duration": 1.367526,
     "end_time": "2022-01-10T17:24:01.890702",
     "exception": false,
     "start_time": "2022-01-10T17:24:00.523176",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip install sqlalchemy==1.4.23 pandas==1.3.1 psycopg2-binary==2.9.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf1d5b7b",
   "metadata": {
    "papermill": {
     "duration": 0.61882,
     "end_time": "2022-01-10T17:24:02.534573",
     "exception": false,
     "start_time": "2022-01-10T17:24:01.915753",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "from sqlalchemy import create_engine\n",
    "from sqlalchemy.orm import sessionmaker\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "caa39d79",
   "metadata": {
    "papermill": {
     "duration": 0.030883,
     "end_time": "2022-01-10T17:24:02.586366",
     "exception": false,
     "start_time": "2022-01-10T17:24:02.555483",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# data to load (expects CSV file with header)\n",
    "data_csv = os.environ.get('data_csv', 'data.csv')\n",
    "\n",
    "# type of database server (sqlalchemy dialect), e.g. postgresql\n",
    "db_type = os.environ.get('db_type', 'postgresql')\n",
    "\n",
    "# hostname of database server\n",
    "host = os.environ.get('host')\n",
    "\n",
    "# database name\n",
    "database = os.environ.get('database')\n",
    "\n",
    "# db user\n",
    "user = os.environ.get('user')\n",
    "\n",
    "# db password\n",
    "password = os.environ.get('password')\n",
    "\n",
    "# db port\n",
    "port = int(os.environ.get('port', 5432))\n",
    "\n",
    "# schema name\n",
    "schema = os.environ.get('schema')\n",
    "\n",
    "# table name\n",
    "table = os.environ.get('table')\n",
    "\n",
    "# drop table before insert\n",
    "drop = bool(os.environ.get('drop', True))\n",
    "\n",
    "# delimiter\n",
    "delimiter = os.environ.get('delimiter', ',')\n",
    "\n",
    "# character encoding\n",
    "encoding = os.environ.get('encoding', 'cp1252')\n",
    "\n",
    "# temporal data storage for local execution\n",
    "data_dir = os.environ.get('data_dir', '../../data/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8effb1da",
   "metadata": {
    "papermill": {
     "duration": 0.030158,
     "end_time": "2022-01-10T17:24:02.640771",
     "exception": false,
     "start_time": "2022-01-10T17:24:02.610613",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "parameters = list(\n",
    "    map(lambda s: re.sub('$', '\"', s),\n",
    "        map(\n",
    "            lambda s: s.replace('=', '=\"'),\n",
    "            filter(\n",
    "                lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\\/A-Za-z0-9]*', s)),\n",
    "                sys.argv\n",
    "            )\n",
    "    )))\n",
    "\n",
    "for parameter in parameters:\n",
    "    logging.warning('Parameter: ' + parameter)\n",
    "    exec(parameter)\n",
    "\n",
    "drop = bool(drop)\n",
    "port = int(port)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94d75ae6",
   "metadata": {
    "papermill": {
     "duration": 0.056927,
     "end_time": "2022-01-10T17:24:02.720893",
     "exception": false,
     "start_time": "2022-01-10T17:24:02.663966",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "engine = create_engine(f'{db_type}://{user}:{password}@{host}:{port}/{database}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0720dd78",
   "metadata": {
    "papermill": {
     "duration": 0.725327,
     "end_time": "2022-01-10T17:24:03.470423",
     "exception": false,
     "start_time": "2022-01-10T17:24:02.745096",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if drop:\n",
    "    with engine.connect() as con:\n",
    "        con.execution_options(autocommit=True).execute(f'DROP TABLE IF EXISTS {schema}.{table};')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5307fb47",
   "metadata": {
    "papermill": {
     "duration": 5.03273,
     "end_time": "2022-01-10T17:24:08.525681",
     "exception": false,
     "start_time": "2022-01-10T17:24:03.492951",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "Session = sessionmaker(bind=engine)\n",
    "\n",
    "with Session() as session:\n",
    "    df = pd.read_csv(data_dir + data_csv, encoding=encoding, delimiter=delimiter)\n",
    "    df.columns= df.columns.str.lower()\n",
    "    df.columns = df.columns.str.replace('[ä,ö,ü]', '[ae,oe,ue]')\n",
    "    df.to_sql(table, con=engine, if_exists='append', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "848c4a95-e974-4754-aff3-42f215491172",
   "metadata": {
    "papermill": {
     "duration": 0.029014,
     "end_time": "2022-01-10T17:24:08.580471",
     "exception": false,
     "start_time": "2022-01-10T17:24:08.551457",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "#import pandas as pd\n",
    "#df = pd.read_csv('../../data/12613-0004_flat.csv', encoding='cp1252', delimiter=';')\n",
    "#df.columns= df.columns.str.lower()\n",
    "#df.columns = df.columns.str.replace('[ä,ö,ü]', '[ae,oe,ue]')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 9.625493,
   "end_time": "2022-01-10T17:24:09.011383",
   "environment_variables": {},
   "exception": null,
   "input_path": "/home/romeokienzler/gitco/claimed/component-library/output/output-rdbms-sqlalchemy.ipynb",
   "output_path": "/home/romeokienzler/gitco/claimed/component-library/output/output-rdbms-sqlalchemy.ipynb",
   "parameters": {},
   "start_time": "2022-01-10T17:23:59.385890",
   "version": "2.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
